{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b742fa29",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import re\n",
    "from collections import Counter\n",
    "\n",
    "data_dir = \"../data/V2\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e589b7c",
   "metadata": {},
   "source": [
    "### Subtask 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a80b017f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train: n=3075\n",
      "dev: n=340\n",
      "test: n=352\n",
      "all: n=3767\n"
     ]
    }
   ],
   "source": [
    "train_splits = ['train','dev']\n",
    "test_splits = ['test']\n",
    "\n",
    "# Read every split once and keep the per-split frames. The combined frame is\n",
    "# built afterwards with a single concat instead of growing a DataFrame inside\n",
    "# the loop, which re-copies all previously loaded rows on every pass.\n",
    "data_dict = {}\n",
    "for split in train_splits+test_splits:\n",
    "    data_dict[split] = pd.read_csv(os.path.join(data_dir, f\"{split}_subtask1.csv\"))\n",
    "    print(f'{split}: n={len(data_dict[split])}')\n",
    "\n",
    "# `split` stays bound to 'all' after this cell, exactly as before, for any\n",
    "# later cell that reads it as the current key.\n",
    "split = 'all'\n",
    "df = pd.concat([data_dict[name] for name in train_splits+test_splits], ignore_index=True)\n",
    "data_dict[split] = df\n",
    "print(f'{split}: n={len(data_dict[split])}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3b1bf709",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['index', 'text', 'label'], dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index the combined frame by its explicit key rather than by whatever value\n",
    "# the loop variable `split` happens to hold when this cell is run.\n",
    "data_dict['all'].columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bd7cb2b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import word_tokenize, sent_tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "90f87a59",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "all's distribution of Causal Label: {1: 1982, 0: 1785}\n",
      "For label=1, n=1982, avg #chars: 191.1, #words: 33.75, #sents: 1.03\n",
      "For label=0, n=1785, avg #chars: 149.59, #words: 26.9, #sents: 1.06\n",
      "For label=all, n=3767, avg #chars: 171.43, #words: 30.5, #sents: 1.04\n",
      "\n",
      "train's distribution of Causal Label: {1: 1624, 0: 1451}\n",
      "For label=1, n=1624, avg #chars: 189.22, #words: 33.44, #sents: 1.02\n",
      "For label=0, n=1451, avg #chars: 148.39, #words: 26.69, #sents: 1.06\n",
      "For label=all, n=3075, avg #chars: 169.95, #words: 30.25, #sents: 1.04\n",
      "\n",
      "dev's distribution of Causal Label: {1: 185, 0: 155}\n",
      "For label=1, n=185, avg #chars: 194.11, #words: 34.41, #sents: 1.04\n",
      "For label=0, n=155, avg #chars: 147.83, #words: 26.85, #sents: 1.04\n",
      "For label=all, n=340, avg #chars: 173.01, #words: 30.96, #sents: 1.04\n",
      "\n",
      "test's distribution of Causal Label: {1: 173, 0: 179}\n",
      "For label=1, n=173, avg #chars: 205.57, #words: 35.93, #sents: 1.03\n",
      "For label=0, n=179, avg #chars: 160.89, #words: 28.67, #sents: 1.11\n",
      "For label=all, n=352, avg #chars: 182.85, #words: 32.24, #sents: 1.07\n"
     ]
    }
   ],
   "source": [
    "for split in ['all']+train_splits+test_splits:\n",
    "    df = data_dict[split].copy()\n",
    "    print(f\"\\n{split}'s distribution of Causal Label:\", dict(Counter(df['label'])))\n",
    "\n",
    "    # Measure every text once per split and store the lengths on the copy.\n",
    "    # The per-label loop below only slices these columns, so NLTK is not run\n",
    "    # on the same text for its own label and then again for label='all'.\n",
    "    df['n_chars'] = [len(t) for t in df['text']]\n",
    "    df['n_words'] = [len(word_tokenize(t)) for t in df['text']]\n",
    "    df['n_sents'] = [len(sent_tokenize(t)) for t in df['text']]\n",
    "\n",
    "    for causal_label in [1,0,'all']:\n",
    "        # 'all' applies no label filter; otherwise keep rows with that label.\n",
    "        if causal_label=='all':\n",
    "            tmp_df = df\n",
    "        else:\n",
    "            tmp_df = df[df['label']==causal_label]\n",
    "\n",
    "        # Adjacent f-strings inside the parentheses concatenate implicitly,\n",
    "        # so no explicit `+` or backslash continuation is needed.\n",
    "        print(f\"For label={causal_label}, n={len(tmp_df)}, avg #chars: {round(tmp_df['n_chars'].mean(),2)}, \"\n",
    "              f\"#words: {round(tmp_df['n_words'].mean(),2)}, #sents: {round(tmp_df['n_sents'].mean(),2)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43041229",
   "metadata": {},
   "source": [
    "### Subtask 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "32a98500",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train: n=1624\n",
      "dev: n=185\n",
      "test: n=173\n",
      "all: n=1982\n"
     ]
    }
   ],
   "source": [
    "train_splits = ['train','dev']\n",
    "test_splits = ['test']\n",
    "\n",
    "# Load each split and keep only the rows with num_rs > 0; the index is reset\n",
    "# so every per-split frame is numbered from 0.\n",
    "data_dict = {}\n",
    "for split in train_splits+test_splits:\n",
    "    tmp_df = pd.read_csv(os.path.join(data_dir, f\"{split}_subtask2_grouped.csv\"))\n",
    "    data_dict[split] = tmp_df[tmp_df['num_rs']>0].reset_index(drop=True)\n",
    "    print(f'{split}: n={len(data_dict[split])}')\n",
    "\n",
    "# Build the combined frame with one concat after the loop instead of growing a\n",
    "# DataFrame inside it. `split` stays bound to 'all' afterwards, as before.\n",
    "split = 'all'\n",
    "df = pd.concat([data_dict[name] for name in train_splits+test_splits], ignore_index=True)\n",
    "data_dict[split] = df\n",
    "print(f'{split}: n={len(data_dict[split])}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9bc88d05",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text',\n",
       "       'causal_text_w_pairs', 'num_rs'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index the combined frame by its explicit key rather than by whatever value\n",
    "# the loop variable `split` happens to hold when this cell is run.\n",
    "data_dict['all'].columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "446098d4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Example, sentence', 'this is']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from ast import literal_eval\n",
    "\n",
    "def get_args(text_w_pairs, search_pattern=r'<ARG0>(.*?)</ARG0>', do_join=True):\n",
    "    \"\"\"Pull the spans matched by `search_pattern` out of a tagged string.\n",
    "\n",
    "    All non-overlapping matches of `search_pattern` in `text_w_pairs` are\n",
    "    collected with `re.findall`. With `do_join` true they are returned as a\n",
    "    single space-separated string (empty when nothing matched); otherwise\n",
    "    the `re.findall` result is returned unchanged.\n",
    "    \"\"\"\n",
    "    matches = re.findall(search_pattern, text_w_pairs)\n",
    "    return ' '.join(matches) if do_join else matches\n",
    "\n",
    "# Quick check: the two signal spans should come back as a two-item list.\n",
    "get_args('<SIG0>Example, sentence</SIG0> <SIG2>this is</SIG2>.', r'<SIG.>(.*?)</SIG.>', False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ac5d6ba5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "all\n",
      "# Sentences: 1982\n",
      "# Relations: 2754\n",
      "Avg. # words: 33.75\n",
      "Cause - Avg. # words: 11.74\n",
      "Effect - Avg. # words: 10.74\n",
      "Signal - Avg. # words: 1.46\n",
      "Distribution of signals: Counter({1: 1850, 0: 863, 2: 40, 3: 1})\n",
      "Distribution of num_rs: Counter({1: 1354, 2: 498, 3: 118, 4: 10, 5: 2})\n",
      "Avg # signals per example: 0.7\n",
      "Avg # rels with signals: 0.69\n",
      "\n",
      "train\n",
      "# Sentences: 1624\n",
      "# Relations: 2257\n",
      "Avg. # words: 33.44\n",
      "Cause - Avg. # words: 11.56\n",
      "Effect - Avg. # words: 10.71\n",
      "Signal - Avg. # words: 1.45\n",
      "Distribution of signals: Counter({1: 1515, 0: 711, 2: 30, 3: 1})\n",
      "Distribution of num_rs: Counter({1: 1105, 2: 418, 3: 90, 4: 9, 5: 2})\n",
      "Avg # signals per example: 0.7\n",
      "Avg # rels with signals: 0.68\n",
      "\n",
      "dev\n",
      "# Sentences: 185\n",
      "# Relations: 249\n",
      "Avg. # words: 34.41\n",
      "Cause - Avg. # words: 12.2\n",
      "Effect - Avg. # words: 10.18\n",
      "Signal - Avg. # words: 1.53\n",
      "Distribution of signals: Counter({1: 154, 0: 92, 2: 3})\n",
      "Distribution of num_rs: Counter({1: 133, 2: 40, 3: 12})\n",
      "Avg # signals per example: 0.64\n",
      "Avg # rels with signals: 0.63\n",
      "\n",
      "test\n",
      "# Sentences: 173\n",
      "# Relations: 248\n",
      "Avg. # words: 35.93\n",
      "Cause - Avg. # words: 12.96\n",
      "Effect - Avg. # words: 11.54\n",
      "Signal - Avg. # words: 1.46\n",
      "Distribution of signals: Counter({1: 181, 0: 60, 2: 7})\n",
      "Distribution of num_rs: Counter({1: 116, 2: 40, 3: 16, 4: 1})\n",
      "Avg # signals per example: 0.79\n",
      "Avg # rels with signals: 0.76\n"
     ]
    }
   ],
   "source": [
    "for split in ['all']+train_splits+test_splits:\n",
    "    split_df = data_dict[split]\n",
    "\n",
    "    print(f'\\n{split}')\n",
    "    print('# Sentences:', len(split_df))\n",
    "    print('# Relations:', split_df['num_rs'].sum())\n",
    "\n",
    "    # Sentence length, counted in NLTK word tokens.\n",
    "    word_lens = [len(word_tokenize(sentence)) for sentence in split_df['text']]\n",
    "    print('Avg. # words:', round(np.mean(word_lens),2))\n",
    "\n",
    "    # Each cell is a Python literal that evaluates to a collection of tagged\n",
    "    # strings (presumably one per relation); parse every cell, then flatten\n",
    "    # to a single list while keeping the original row order.\n",
    "    relations_by_sentence = [list(literal_eval(cell)) for cell in split_df['causal_text_w_pairs']]\n",
    "    num_rs_counts = [len(relations) for relations in relations_by_sentence]\n",
    "    tagged_relations = [rel for relations in relations_by_sentence for rel in relations]\n",
    "\n",
    "    # <ARG0> (cause) and <ARG1> (effect) spans are joined per tagged string\n",
    "    # before being tokenised.\n",
    "    cause_lens = [len(word_tokenize(get_args(rel, search_pattern=r'<ARG0>(.*?)</ARG0>')))\n",
    "                  for rel in tagged_relations]\n",
    "    effect_lens = [len(word_tokenize(get_args(rel, search_pattern=r'<ARG1>(.*?)</ARG1>')))\n",
    "                   for rel in tagged_relations]\n",
    "\n",
    "    # Signal spans are kept as separate items so they can be counted per\n",
    "    # tagged string and measured individually.\n",
    "    signals_by_relation = [get_args(rel, r'<SIG.>(.*?)</SIG.>', False) for rel in tagged_relations]\n",
    "    signal_counts = [len(signals) for signals in signals_by_relation]\n",
    "    signal_lens = [len(word_tokenize(signal)) for signals in signals_by_relation for signal in signals]\n",
    "    has_signal = [1 if count>0 else 0 for count in signal_counts]\n",
    "\n",
    "    print('Cause - Avg. # words:', round(np.mean(cause_lens),2))\n",
    "    print('Effect - Avg. # words:', round(np.mean(effect_lens),2))\n",
    "    print('Signal - Avg. # words:', round(np.mean(signal_lens),2))\n",
    "    print(f'Distribution of signals: {Counter(signal_counts)}')\n",
    "    print(f'Distribution of num_rs: {Counter(num_rs_counts)}')\n",
    "    print('Avg # signals per example:', round(np.mean(signal_counts),2))\n",
    "    print('Avg # rels with signals:', round(np.mean(has_signal),2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a25275b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
